# -*- coding: utf-8 -*-
"""
Created on Sun Jul 29 15:48:35 2018

@author: 君
"""
from lxml import etree
import urllib.request as r
import re

# HTTP headers attached to every request this script sends: a desktop
# Chrome-on-Windows User-Agent string plus an X-Requested-With header
# (the value conventionally sent by browser XMLHttpRequest calls).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/48.0.2564.82 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}


# Scrape the names of "Project 211" universities from gaokaopai.com.
# The dict is used as an insertion-ordered set: each key is a school name
# and the value is always a placeholder 0, so duplicate names collapse.
schoolLists211 = {}
# URL parameters (0-0-0-0-1-0-0--p-{}), as described by the original author:
#   school location, school category,
#   school nature (public 1, private 2),
#   school level (undergraduate 1, junior college 2),
#   school feature (any 0, Project 211 1, Project 985 2, independent
#     enrollment 3, research institute 4, national-defense students 5,
#     Excellence Plan 6),
#   ranking mode (none 0, number of master's programs 1, number of doctoral
#     programs 2, gaokaopai ranking 3, influence 4),
#   and the value after "p-" is the page number.
# NOTE(review): the URL carries seven dash-separated values before "--p-"
# but only six fields are listed above — confirm the exact mapping.

# Compiled once outside the loop; strips a full-width parenthesised note
# (（...）) from a school name.
_PAREN_NOTE = re.compile(r"\（.*?\）")

# Pages 1..17 are fetched; the page count is hard-coded.
for j in range(1, 18):
    url = 'http://www.gaokaopai.com/daxue-0-0-0-0-1-0-0--p-{}.html'.format(j)
    req = r.Request(url, headers=headers)
    # Use the response as a context manager so the connection is closed
    # after each page, and bound the wait so a stalled server cannot hang
    # the script indefinitely.
    with r.urlopen(req, timeout=30) as resp:
        # Undecodable bytes are dropped rather than raising.
        data = resp.read().decode('utf-8', 'ignore')
    # Text of every <a> inside <h3> inside <div class="tit">.
    schoolList = etree.HTML(data).xpath("//div[@class='tit']/h3/a/text()")

    for i in schoolList:
        i = _PAREN_NOTE.sub('', i)
        schoolLists211[i] = 0
    print("完成"+str(j)+"页")
# Unique school names in first-seen order.
aaa = list(schoolLists211)
print(aaa)
